{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sentiment Classfication By Word Averaging"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import random\n",
    "\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import torch.optim as optim\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "random.seed(2019)\n",
    "# 使用benchmark以启动CUDNN_FIND自动寻找最快的操作，\n",
    "# 当计算图不会改变的时候（每次输入形状相同，模型不改变）的情况下可以提高性能，反之则降低性能。\n",
    "# torch.backends.cudnn.deterministic = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "VOCAB_SIZE = 14_828\n",
    "\n",
    "EPOCHS = 5\n",
    "BATCH_SIZE = 32\n",
    "LEARNING_RATE = 0.01\n",
    "BEST_VALID_LOSS = float('inf')\n",
    "\n",
    "EMBEDDING_DIM = 100\n",
    "OUTPUT_DIM = 1\n",
    "\n",
    "train_file = \"data/senti.train.tsv\"\n",
    "eval_file = \"data/senti.dev.tsv\"\n",
    "test_file = \"data/senti.test.tsv\"\n",
    "\n",
    "USE_CUDA = torch.cuda.is_available()\n",
    "DEVICE = torch.device('cuda:2' if USE_CUDA else 'cpu')\n",
    "NUM_CUDA = torch.cuda.device_count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_text_file(filename):\n",
    "    \"\"\"将样本的特征与标签分开，并将样本特征分词\"\"\"\n",
    "    sentences = []\n",
    "    label = []\n",
    "    with open(filename, \"r\") as f:\n",
    "        sent_list  = [line.strip().split('\\t') for line in f]\n",
    "    for sample in sent_list:\n",
    "        sentences.append(sample[0].lower().split(\" \"))\n",
    "        label.append(int(sample[-1]))\n",
    "    return sentences, label\n",
    "\n",
    "\n",
    "def build_word_dic(sentences_list, vocab_size=20_000):\n",
    "    \"\"\"构建words_set, word2idx, idx2word\"\"\"\n",
    "    words_list = [w for line in sentences_list for w in line]\n",
    "    counter = Counter(words_list)\n",
    "    words_topn = counter.most_common(vocab_size)\n",
    "    words_set = [item[0] for item in words_topn]\n",
    "    words_set = ['<pad>', \"<unk>\"] + words_set\n",
    "    word2idx = {w:i for i, w in enumerate(words_set)}\n",
    "    idx2word = {i:w for i, w in enumerate(words_set)}\n",
    "    return words_topn, word2idx, idx2word\n",
    "\n",
    "\n",
    "def build_x_y(word2idx, sentences_list, label_list, sent_len=30):\n",
    "    \"\"\"构建输入模型的数据，对每个单词编码，每个句子通过添加pading保持一样长\"\"\"\n",
    "    x = []\n",
    "    y = []\n",
    "    for sent, label in zip(sentences_list, label_list):\n",
    "        word_x = [0]*sent_len\n",
    "        if len(sent) > sent_len:\n",
    "            sent = sent[:sent_len]\n",
    "        for i, w in enumerate(sent):\n",
    "            if w in word2idx:\n",
    "                word_x[i] = word2idx[w]\n",
    "            else:\n",
    "                word_x[i] = word2idx['<unk>']\n",
    "        x.append(word_x)\n",
    "        y.append(label)\n",
    "    return x, y\n",
    "\n",
    "# 构造批次数据\n",
    "def build_batch_data(data, label, batch_size=32):\n",
    "    \"\"\"构建 batch tensor，返回 batch 列表，每个batch为二元组包含data和label\"\"\"\n",
    "    batch_data = []\n",
    "    data_tensor = torch.tensor(data, dtype=torch.long)\n",
    "    label_tensor = torch.tensor(label, dtype=torch.float)\n",
    "    n, dim = data_tensor.size()\n",
    "    for start in range(0, n, batch_size):\n",
    "        end = start + batch_size\n",
    "        if end > n:\n",
    "            break\n",
    "            dbatch = data_tensor[start: ]\n",
    "            lbatch = label_tensor[start: ]\n",
    "            print(\"最后一个batch size:\", dbatch.size())\n",
    "        else:\n",
    "            dbatch = data_tensor[start: end]\n",
    "            lbatch = label_tensor[start: end]\n",
    "        batch_data.append((dbatch, lbatch))\n",
    "    return batch_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_sentences, train_label = load_text_file(train_file)\n",
    "eval_sentences, eval_label = load_text_file(eval_file)\n",
    "test_sentences, test_label = load_text_file(test_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "处理后的样本与标签： ['hide', 'new', 'secretions', 'from', 'the', 'parental', 'units'] 0\n",
      "各个数据集样本数量：\n",
      "67349 67349\n",
      "872 872\n",
      "1821 1821\n",
      "各数据集最长最短句子单词数：\n",
      "52 1\n",
      "47 2\n",
      "56 2\n"
     ]
    }
   ],
   "source": [
    "print(\"处理后的样本与标签：\", train_sentences[0], train_label[0])\n",
    "print(\"各个数据集样本数量：\")\n",
    "print(len(train_sentences), len(train_label))\n",
    "print(len(eval_sentences), len(eval_label))\n",
    "print(len(test_sentences), len(test_label))\n",
    "\n",
    "print(\"各数据集最长最短句子单词数：\")\n",
    "print(max([len(s) for s in train_sentences]), min([len(s) for s in train_sentences]))\n",
    "print(max([len(s) for s in eval_sentences]), min([len(s) for s in eval_sentences]))\n",
    "print(max([len(s) for s in test_sentences]), min([len(s) for s in test_sentences]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_seq_len = 56\n",
    "words_set, word2idx, idx2word = build_word_dic(train_sentences, vocab_size=VOCAB_SIZE)\n",
    "train_x, train_y = build_x_y(word2idx, train_sentences, train_label,sent_len=max_seq_len)\n",
    "eval_x, eval_y = build_x_y(word2idx, eval_sentences, eval_label,sent_len=max_seq_len)\n",
    "test_x, test_y = build_x_y(word2idx, test_sentences, test_label,sent_len=max_seq_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "词典长度: 14828 14830 14830\n",
      "训练集样本数量: 67349 67349\n"
     ]
    }
   ],
   "source": [
    "print(\"词典长度:\", len(words_set), len(word2idx), len(idx2word))\n",
    "print(\"训练集样本数量:\", len(train_x), len(train_y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = build_batch_data(train_x, train_y, batch_size=BATCH_SIZE)\n",
    "eval_data = build_batch_data(eval_x, eval_y, batch_size=BATCH_SIZE)\n",
    "test_data = build_batch_data(test_x, test_y, batch_size=BATCH_SIZE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Word Averaging Model\n",
    "class WordAVGModel(nn.Module):\n",
    "    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):\n",
    "        super(WordAVGModel, self).__init__()\n",
    "        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)\n",
    "        self.fc = nn.Linear(embedding_dim, output_dim)\n",
    "        \n",
    "    def forward(self, data):\n",
    "        # print(\"data\", data.size())\n",
    "        embedded = self.embedding(data) # [sent len, batch size, emb dim]\n",
    "        # print(\"embdded\", embedded.size())\n",
    "        # embedded = embedded.permute(1, 0, 2) # [batch size, sent len, emb dim]\n",
    "        # print(\"embdded2:\", embedded.size())\n",
    "        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1) # [batch size, embedding_dim]\n",
    "        # print(\"poold:\", pooled.size())\n",
    "        score = self.fc(pooled)\n",
    "        return score\n",
    "    \n",
    "def binary_accuracy(preds, y):\n",
    "    \"\"\"\n",
    "    计算准确率\n",
    "    \"\"\"\n",
    "    rounded_preds = torch.round(torch.sigmoid(preds))\n",
    "    correct = (rounded_preds == y).float()  \n",
    "    acc = correct.sum()/len(correct)\n",
    "    return acc\n",
    "\n",
    "def train(model, device, iterator, optimizer, criterion):\n",
    "    \"\"\"训练函数\"\"\"\n",
    "    \n",
    "    epoch_loss = 0\n",
    "    epoch_acc = 0\n",
    "    model.train()\n",
    "    \n",
    "    for x, y in iterator:\n",
    "        x, y = x.to(device), y.to(device) # torch.int64\n",
    "        optimizer.zero_grad()\n",
    "        predictions = model(x).squeeze(1)  # torch.float32 \n",
    "        \n",
    "        loss = criterion(predictions, y)\n",
    "        acc = binary_accuracy(predictions, y)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "        \n",
    "        epoch_loss += loss.item()\n",
    "        epoch_acc += acc.item()\n",
    "        \n",
    "    return epoch_loss / len(iterator), epoch_acc / len(iterator)\n",
    "\n",
    "def evaluate(model, device, iterator, criterion):\n",
    "    \"\"\"验证函数\"\"\"\n",
    "    epoch_loss = 0\n",
    "    epoch_acc = 0\n",
    "    model.eval()\n",
    "    \n",
    "    with torch.no_grad():\n",
    "        for x, y in iterator:\n",
    "            x, y = x.to(device), y.to(device)\n",
    "            predictions = model(x).squeeze(1)\n",
    "            loss = criterion(predictions, y)\n",
    "            acc = binary_accuracy(predictions, y)\n",
    "            epoch_loss += loss.item()\n",
    "            epoch_acc += acc.item()\n",
    "        \n",
    "    return epoch_loss / len(iterator), epoch_acc / len(iterator)\n",
    "\n",
    "def count_parameters(model):\n",
    "    \"\"\"统计模型的参数量\"\"\"\n",
    "    return sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
    "\n",
    "def epoch_time(start_time, end_time):\n",
    "    \"\"\"计算时间差，单位秒\"\"\"\n",
    "    elapsed_time = end_time - start_time\n",
    "    elapsed_mins = int(elapsed_time / 60)\n",
    "    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n",
    "    return elapsed_mins, elapsed_secs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "模型有1,483,101个可调节参数, 大约5.657581329345703 M.\n"
     ]
    }
   ],
   "source": [
    "INPUT_DIM = len(words_set) + 2\n",
    "PAD_IDX = word2idx['<pad>']\n",
    "\n",
    "model = WordAVGModel(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)\n",
    "print(f'模型有{count_parameters(model):,}个可调节参数, 大约{count_parameters(model)*4/1024/1024} M.')\n",
    "\n",
    "model = model.to(DEVICE)\n",
    "\n",
    "# 使用多块GPU\n",
    "# if NUM_CUDA > 1:\n",
    "#     device_ids = list(range(NUM_CUDA))\n",
    "#     print(device_ids)\n",
    "#     model = nn.DataParallel(model, device_ids=device_ids)\n",
    "optimizer = optim.Adam(model.parameters(),lr=LEARNING_RATE)\n",
    "criterion = nn.BCEWithLogitsLoss()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 训练模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/root/anaconda3/lib/python3.6/site-packages/torch/serialization.py:251: UserWarning: Couldn't retrieve source code for container of type WordAVGModel. It won't be checked for correctness upon loading.\n",
      "  \"type \" + obj.__name__ + \". It won't be checked \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "***Save Best Model wordavg-model.pth***\n",
      "Epoch: 01 | Epoch Time: 0m 7s\n",
      "\tTrain Loss: 0.354 | Train Acc: 84.66%\n",
      "\t Val. Loss: 0.578 |  Val. Acc: 81.25%\n",
      "Epoch: 02 | Epoch Time: 0m 6s\n",
      "\tTrain Loss: 0.216 | Train Acc: 91.85%\n",
      "\t Val. Loss: 0.704 |  Val. Acc: 82.18%\n",
      "Epoch: 03 | Epoch Time: 0m 5s\n",
      "\tTrain Loss: 0.182 | Train Acc: 93.14%\n",
      "\t Val. Loss: 0.815 |  Val. Acc: 81.60%\n",
      "Epoch: 04 | Epoch Time: 0m 6s\n",
      "\tTrain Loss: 0.164 | Train Acc: 93.96%\n",
      "\t Val. Loss: 0.921 |  Val. Acc: 81.13%\n",
      "Epoch: 05 | Epoch Time: 0m 6s\n",
      "\tTrain Loss: 0.152 | Train Acc: 94.41%\n",
      "\t Val. Loss: 1.029 |  Val. Acc: 80.32%\n"
     ]
    }
   ],
   "source": [
    "model_name = 'wordavg-model.pth'\n",
    "for epoch in range(1, EPOCHS+1):\n",
    "    start_time = time.time()\n",
    "    train_loss, train_acc = train(model, DEVICE, train_data, optimizer, criterion)\n",
    "    valid_loss, valid_acc = evaluate(model, DEVICE, eval_data, criterion)\n",
    "    end_time = time.time()\n",
    "\n",
    "    epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n",
    "    if valid_loss < BEST_VALID_LOSS:\n",
    "        BEST_VALID_LOSS = valid_loss\n",
    "        torch.save(model, model_name)\n",
    "        print(f'***Save Best Model {model_name}***')\n",
    "    \n",
    "    print(f'Epoch: {epoch :02} | Epoch Time: {epoch_mins}m {epoch_secs}s')\n",
    "    print(f'\\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')\n",
    "    print(f'\\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 测试集上的表现"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test Loss: 0.5238943193107843 | Test Acc: 0.8052455357142857 |\n"
     ]
    }
   ],
   "source": [
    "model = torch.load(model_name)\n",
    "test_loss, test_acc = evaluate(model, DEVICE, test_data, criterion)\n",
    "print('Test Loss: {0} | Test Acc: {1} |'.format(test_loss, test_acc))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 计算词向量L2 Norm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Embed size: torch.Size([14830, 100])\n"
     ]
    }
   ],
   "source": [
    "embed = model.embedding.weight.data\n",
    "print(\"Embed size:\", embed.size())\n",
    "word_l2norm = torch.norm(embed,  dim=1)\n",
    "embed_l2norm, embed_l2normnorm_idx = word_l2norm.sort()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "L2 norm 最小的 15 个单词：\n",
      "<pad> 0.0\n",
      "urges 7.781852722167969\n",
      "diggs 7.7840423583984375\n",
      "flowers 7.9071125984191895\n",
      "specialized 7.996907711029053\n",
      "flows 8.031326293945312\n",
      "clear 8.063986778259277\n",
      "hijinks 8.064300537109375\n",
      "shaped 8.083154678344727\n",
      "farcically 8.089497566223145\n",
      "digital 8.095300674438477\n",
      "convenience 8.112516403198242\n",
      "theatrics 8.121846199035645\n",
      "superman 8.127737998962402\n",
      "shearer 8.129433631896973\n"
     ]
    }
   ],
   "source": [
    "print('L2 norm 最小的 15 个单词：')\n",
    "for i,s in zip(embed_l2normnorm_idx[:15].tolist(), embed_l2norm[:15].tolist()):\n",
    "    print(idx2word[i], s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "L2 norm 最大的 15 个单词：\n",
      "refreshing 21.444095611572266\n",
      "unfunny 21.45452117919922\n",
      "touching 21.50410270690918\n",
      "wonderful 21.59657096862793\n",
      "devoid 21.63581657409668\n",
      "lousy 21.68600845336914\n",
      "pointless 21.707721710205078\n",
      "depressing 21.835289001464844\n",
      "hilarious 22.01133155822754\n",
      "worst 22.104299545288086\n",
      "failure 22.8591365814209\n",
      "stupid 23.103296279907227\n",
      "remarkable 23.48238182067871\n",
      "mess 24.341245651245117\n",
      "lacking 24.97127914428711\n"
     ]
    }
   ],
   "source": [
    "print('L2 norm 最大的 15 个单词：')\n",
    "for i,s in zip(embed_l2normnorm_idx[-15:].tolist(), embed_l2norm[-15:].tolist()):\n",
    "    print(idx2word[i],s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
